rm(list = ls())
# install packages (if not installed)
list.of.packages <- c("haven", "dplyr","metafor","lmtest","multiwayvcov","sandwich","metaviz")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)

# set working directory and import libraries
setwd("P:/My Documents/My Files/UCMETA/UC META-ANALYSIS WORKSHOP (March 2022)")
library(haven) # need this package to load STATA data (.dta) file
library(dplyr) # need this package to organize dataset
library(metafor) # need this package to use meta estimators
library(lmtest) # need this package to correct standatd errors
library(multiwayvcov) # need this package to correct standatd errors
library(sandwich) # need this package to correct standatd errors
library(metaviz) # use this package for forest plot

### Loading dataset
DT <- as.data.frame(read_dta('meta-orig.dta'))

### Definition of basic variables
DT$tstat = DT$t
DT$df = DT$samplesize - DT$k
DT$pcc = DT$tstat/sqrt(DT$tstat^2+DT$df)
DT$varpcc = (1-DT$pcc^2)/DT$df
DT$sepcc = sqrt(DT$varpcc)
DT$avgyear=(DT$end+DT$start)/2
DT$tspan= DT$end-DT$start+1


### Distribution of t-stats and PCCs before kicking out outliers
summary(DT$tstat)
quantile(DT$tstat, probs = seq(0, 1, 0.25), na.rm = TRUE, type=1)
summary(DT$df)
quantile(DT$df, probs = seq(0, 1, 0.25), na.rm = TRUE, type=1)
summary(DT$pcc)
quantile(DT$pcc, probs = seq(0, 1, 0.01), na.rm = TRUE, type=1)


### Kick out extreme values of PCC
rng <- as.numeric(quantile(DT$pcc, probs = c(0.01,0.99), na.rm = TRUE, type=1))
DT2 <- DT[(rng[1]<DT$pcc & DT$pcc<rng[2]),]
summary(DT2$tstat)
summary(DT2$df)
summary(DT2$pcc)


### Characteristics of t-stats and PCCs of final data set
### if not run, remove 'x11()';
x11(); hist(DT2$tstat, breaks=80, main='Histogram of T-statistics', freq = FALSE)

DT2$tstat1 = (DT2$tstat < -2)
DT2$tstat2 = (DT2$tstat >= -2 & DT2$tstat <= 2)
DT2$tstat3 = (DT2$tstat > 2)
summary(as.numeric(DT2$tstat1))
summary(as.numeric(DT2$tstat2))
summary(as.numeric(DT2$tstat3))


### Figure 1 ###
### if not run, remove 'x11()';
x11(); hist(DT2$pcc, breaks=80, main='Histogram of PCC', freq = FALSE)



###############################
### Comparison of alternative "Fixed Effects" estimators
###############################
# Standard FE estimate
FE <- rma(pcc~1, sei=sepcc, data=DT2, method='FE')
print(summary(FE))

# Can get the same estimate doing the following
FEWLS <- lm(pcc~1, weights=1/DT2$sepcc^2, data=DT2)
FEeffects <- as.numeric(FEWLS$coefficients)
FEtstat <- as.numeric(FEWLS$coefficients/sqrt(summary(FEWLS)$cov.unscaled))
cat('FEeffect = ',FEeffects, '; FEtstat = ',FEtstat,'\n')

### TABLE 4; Panel A, FE1
FEweight1 <- lm(pcc~1, weights=1/sepcc^2, data=DT2)
FEweight1_clustered <- coeftest(FEweight1, cluster.vcov(FEweight1, DT2$idstudy)) 
print(FEweight1_clustered)
### The above is Fixed Effects-WLS
### Note coefficents are the same, but standard errors are different, both
### because of WLS and cluster robust standard errors




###############################
### Comparison of alternative "Random Effects" estimators
###############################
REML <- rma(pcc~1, sei=sepcc, data=DT2, method='REML')
summary(REML)

### Can get the same estimate doing the following
DT2$REweight1 <- 1/(REML$tau2+DT2$sepcc^2)
REWLS <- lm(pcc~1, weights=DT2$REweight1, data=DT2)
REeffects <- as.numeric(REWLS$coefficients)
REtstat <- as.numeric(REWLS$coefficients/sqrt(summary(REWLS)$cov.unscaled))
cat('REeffect = ',REeffects, '; REtstat = ',REtstat,'\n')

### TABLE 4; Panel A, RE1
REWLS <- lm(pcc~1, weights=DT2$REweight1, data=DT2)
REWLS_clustered <- coeftest(REWLS, cluster.vcov(REWLS, DT2$idstudy)) 
print(REWLS_clustered)
### And this is Random Effects-WLS
### Again, coefficents are the same, but standard errors are different


###############################
### Determining the weight of individual studies (Fixed Effects)
###############################
### Three studies account for approximately 79% of the weight
### As a result, prefer Random Effects in subsequent analysis
subDT <- subset(DT2, select=c('idstudy','pcc','sepcc'))
subDT$FEweight1 <- 1/subDT$sepcc^2
subDT$pccFE <- subDT$pcc*subDT$FEweight1
subDT2 <- as.data.frame(subDT %>%
  group_by(idstudy) %>%
  summarise(
    pccFE = sum(pccFE, na.rm = T),
    FEweight1 = sum(FEweight1, na.rm = T),
    max = max(pcc, na.rm = T),
    min = min(pcc, na.rm = T)
  ))
subDT2$avgpccFE <- subDT2$pccFE/subDT2$FEweight1
subDT2$avgsepccFE <- sqrt(1/subDT2$FEweight1)
subDT2$WT <- 100*subDT2$FEweight1/sum(subDT2$FEweight1)
subDT2$lower <- subDT2$avgpccFE - 1.96*subDT2$avgsepccFE
subDT2$upper <- subDT2$avgpccFE + 1.96*subDT2$avgsepccFE
View(cbind.data.frame(StudyID = subDT2$idstudy,
                      FEeffect=subDT2$avgpccFE, 
                      LowerBound=subDT2$lower,
                      UpperBound=subDT2$upper,
                      WT=subDT2$WT))
summary(as.numeric(subDT2$WT))
subDT2 <- subDT2[order(-subDT2$WT),]
head(subDT2)

### making forest plot using 'metaviz' package
### if not run, remove 'x11()';
x11(); viz_thickforest(x = subDT[, c("pcc", "sepcc")], 
                       group = subDT[, "idstudy"],
                       type="summary_only",
                       method = "FE",
                       annotate_CI=TRUE)
### making forest plot using 'metafor' package
x11(); forest(subDT2$avgpccFE, subDT2$avgsepccFE^2, slab=subDT2$idstudy, psize=1)




###############################
### Determining the weight of individual studies (Random Effects)
###############################
subDT <- subset(DT2, select=c('idstudy','pcc','sepcc'))
REtau2 <- rma(pcc~1, sei=sepcc,  data=subDT, method='REML')$tau2
subDT$REweight1 <- 1/(REtau2 + subDT$sepcc^2)
subDT$pccRE <- subDT$pcc*subDT$REweight1
subDT2 <- as.data.frame(subDT %>%
                          group_by(idstudy) %>%
                          summarise(
                            pccRE = sum(pccRE, na.rm = T),
                            REweight1 = sum(REweight1, na.rm = T),
                            max = max(pcc, na.rm = T),
                            min = min(pcc, na.rm = T)
                          ))
subDT2$avgpccRE <- subDT2$pccRE/subDT2$REweight1
subDT2$avgsepccRE <- sqrt(1/subDT2$REweight1)
subDT2$WT <- 100*subDT2$REweight1/sum(subDT2$REweight1)
subDT2$lower <- subDT2$avgpccRE - 1.96*subDT2$avgsepccRE
subDT2$upper <- subDT2$avgpccRE + 1.96*subDT2$avgsepccRE
View(cbind.data.frame(StudyID = subDT2$idstudy,
                      REeffect=subDT2$avgpccRE, 
                      LowerBound=subDT2$lower,
                      UpperBound=subDT2$upper,
                      WT=subDT2$WT))
summary(as.numeric(subDT2$WT))
subDT2 <- subDT2[order(-subDT2$WT),]
head(subDT2)

### making forest plot using 'metaviz' package
### if not run, remove 'x11()';
x11(); viz_thickforest(x = subDT[, c("pcc", "sepcc")], 
                       group = subDT[, "idstudy"],
                       type="summary_only",
                       method = "REML",
                       annotate_CI=TRUE)
### making forest plot using 'metafor' package
x11(); forest(subDT2$avgpccRE, subDT2$avgsepccRE^2, slab=subDT2$idstudy, psize=1)


###############################
### Number of estimates per study
###############################
subDT <- subset(DT2, select=c('idstudy','pcc','sepcc'))
subDT2 <- as.data.frame(subDT %>%
                          group_by(idstudy) %>%
                          summarise(
                            numberests = length(pcc)
                          ))

### if not run, remove 'x11()';
x11(); hist(subDT2$numberests, breaks=30, main='Number of Estimates', freq = TRUE); box()


###############################
### Weighting by Number of estimates per study + stderror
###############################
subDT <- subset(DT2, select=c('idstudy','pcc','sepcc'))
subDT <- right_join(subDT, subDT2[,c('idstudy','numberests')], by=c('idstudy'))
subDT$FEweight1 <- 1/subDT$sepcc^2
subDT$FEweight2 <- 1/(subDT$numberests*subDT$sepcc^2)
subDT$REweight1 <- 1/(REML$tau2 + subDT$sepcc^2)
subDT$REweight2 <- 1/((REML$tau2 + subDT$sepcc^2)*subDT$numberests)


#############
### TABLE 4 (Panel A, FE2) 
#############
FEWLS <- lm(pcc~1, weights=FEweight2, data=subDT)
FEWLS_clustered <- coeftest(FEWLS, cluster.vcov(FEWLS, subDT$idstudy)) 
print(FEWLS_clustered)

#############
### TABLE 4 (Panel A, RE2) 
#############
REWLS <- lm(pcc~1, weights=REweight2, data=subDT)
REWLS_clustered <- coeftest(REWLS, cluster.vcov(REWLS, subDT$idstudy)) 
print(REWLS_clustered)


###############################
### Egger regressions/FAT-PET regressions
###############################
#############
### TABLE 4 (Panel B, FE1)
#############
FE1WLS <- lm(pcc~sepcc, weights=FEweight1, data=subDT)
FE1WLS_clustered <- coeftest(FE1WLS, cluster.vcov(FE1WLS, subDT$idstudy)) 
print(FE1WLS_clustered)


#############
### TABLE 4 (Panel B, FE2)
#############
FE2WLS <- lm(pcc~sepcc, weights=FEweight2, data=subDT)
FE2WLS_clustered <- coeftest(FE2WLS, cluster.vcov(FE2WLS, subDT$idstudy)) 
print(FE2WLS_clustered)


#############
### TABLE 4 (Panel B, RE1)
#############
RE1WLS <- lm(pcc~sepcc, weights=REweight1, data=subDT)
RE1WLS_clustered <- coeftest(RE1WLS, cluster.vcov(RE1WLS, subDT$idstudy)) 
print(RE1WLS_clustered)


#############
### TABLE 4 (Panel B, RE2)
#############
RE2WLS <- lm(pcc~sepcc, weights=REweight2, data=subDT)
RE2WLS_clustered <- coeftest(RE2WLS, cluster.vcov(RE2WLS, subDT$idstudy)) 
print(RE2WLS_clustered)


###############################
### Characteristics of the data
###############################
DT3 <- DT2 %>% rename(id = idstudy,
                      all = spill_other_firm,
                      spill_reg = sameregion,
                      spill_ind = sameindustry,
                      spill_ex = spill_exporter,
                      value = spill_value,
                      employment = spill_employment,
                      output = spill_output,
                      number = spill_number,
                      asset = spill_assets,
                      rdsp = spill_rd,
                      other_ms = spill_other,
                      ogls = olsgls,
                      plt = probit,
                      nsph = nonsphericalerrors,
                      endg = endogeneity,
                      catg = categorical,
                      sps = sampleselection,
                      fe = panel_fe,
                      agg = aggregate,
                      cs = crosssection,
                      labq = labourquality,
                      prod = productivity,
                      pubyear = publicationyear)


DT3$all[DT3$foreign==1] <- 1
DT3 <- DT3 %>% rename(nondomestic = all)
DT3$other_ms[DT3$asset==1] <- 1
DT3$impact[is.na(DT3$impact)] <- 0
DT3$firm_level <- 1*(DT3$agg==0)


#############
### TABLE 1
#############
sumStat <- function(sumDT){
  sumTable <- as.data.frame(matrix(0, ncol=6, nrow=ncol(sumDT)))
  colnames(sumTable) <- c('Variable','Obs','Mean','Std.Dev','Min','Max')
  sumTable$Variable <- colnames(sumDT)
  for(i in 1:ncol(sumDT)){
    sumTable[i,2] <- length(sumDT[,i])
    sumTable[i,3] <- mean(sumDT[,i])
    sumTable[i,4] <- sd(sumDT[,i])
    sumTable[i,5] <- min(sumDT[,i])
    sumTable[i,6] <- max(sumDT[,i])
  }
  return(sumTable)
}


### Spillover mechanisms
print(sumStat(DT3[,c('spill_ex','spill_reg',
                     'spill_ind','spill_fdi')]))

### Spillover measures
print(sumStat(DT3[,c('number','value',
                     'employment','output',
                     'rdsp','other_ms')]))

### Dependent variable
print(sumStat(cbind.data.frame(binary=DT3[,c('binary')])))

### Data characteristics
print(sumStat(DT3[,c('firm_level','domestic','avgyear')]))

### Countries
print(sumStat(DT3[,c('oecd','eu','developing','china')]))

### Industry group
print(sumStat(DT3[,c('manufacturing','service','it',
                     'food','other_industry')]))

### Estimation type
DT3$other_est <- 1 - DT3$ogls - DT3$plt
print(sumStat(DT3[,c('plt','ogls','other_est')]))

### Estimation type - endogeneity
DT3$endg_sps <- 1*(DT3$endg==1 & DT3$sps==1)
print(sumStat(DT3[,c('endg','sps','fe')]))

### Control variables
print(sumStat(DT3[,c('size','prod','labq','capital','rd')]))

### Study characteristics
print(sumStat(DT3[,c('published','impact','citation')]))


## End of Part I Code







